import plotly
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
import pandas as pd
# Load the Netflix titles CSV and show its (rows, columns) size.
df = pd.read_csv(filepath_or_buffer='./data/netflix_titles.csv')
df.shape
(8807, 12)
# Drop rows lacking cast/director first, then any row with a missing value at all.
df = df.dropna(subset=['cast', 'director'], how='any').dropna()

# Parse the textual date once and derive the year/month columns from it.
added_on = pd.to_datetime(df['date_added'])
df["date_added"] = added_on
df['year_added'] = added_on.dt.year
df['month_added'] = added_on.dt.month
df.head()
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7 | s8 | Movie | Sankofa | Haile Gerima | Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D... | United States, Ghana, Burkina Faso, United Kin... | September 24, 2021 | 1993 | TV-MA | 125 min | Dramas, Independent Movies, International Movies | On a photo shoot in Ghana, an American model s... |
| 8 | s9 | TV Show | The Great British Baking Show | Andy Devonshire | Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho... | United Kingdom | September 24, 2021 | 2021 | TV-14 | 9 Seasons | British TV Shows, Reality TV | A talented batch of amateur bakers face off in... |
| 9 | s10 | Movie | The Starling | Theodore Melfi | Melissa McCarthy, Chris O'Dowd, Kevin Kline, T... | United States | September 24, 2021 | 2021 | PG-13 | 104 min | Comedies, Dramas | A woman adjusting to life after a loss contend... |
| 12 | s13 | Movie | Je Suis Karl | Christian Schwochow | Luna Wedler, Jannis Niewöhner, Milan Peschel, ... | Germany, Czech Republic | September 23, 2021 | 2021 | TV-MA | 127 min | Dramas, International Movies | After most of her family is murdered in a terr... |
| 24 | s25 | Movie | Jeans | S. Shankar | Prashanth, Aishwarya Rai Bachchan, Sri Lakshmi... | India | September 21, 2021 | 1998 | TV-14 | 166 min | Comedies, International Movies, Romantic Movies | When the father of the man she loves insists t... |
# Shared sample grid for the demo curves: 0.0, 0.1, ..., 4.9.
x = np.arange(start=0, stop=5, step=0.1)
def square(x):
    """Return ``x`` raised to the second power.

    Only the ``**`` operator is used, so the function works on plain
    numbers and element-wise on NumPy arrays alike.
    """
    return x ** 2
def cubic(x):
    """Return ``x`` raised to the third power.

    The previous body computed ``5 * x**0.5`` (a scaled square root), which
    contradicted both the function name and the ``x<sup>3</sup>`` legend
    label used when the curve is plotted.

    Only the ``**`` operator is used, so the function works on plain
    numbers and element-wise on NumPy arrays alike.
    """
    return x ** 3
Plotly themes:
# Line-chart demo: three curves over the shared grid `x`, each with its own draw mode.
fig = go.Figure()
demo_traces = [
    # <sup> is used in the name because LaTeX doesn't work for the hover hint
    go.Scatter(x=x, y=square(x), mode='lines+markers', name='f(x)=x<sup>2</sup>'),
    go.Scatter(x=x, y=x, name='$$g(x)=x$$'),
    go.Scatter(x=x, y=cubic(x), mode='markers', name='f(x)=x<sup>3</sup>'),
]
for demo_trace in demo_traces:
    fig.add_trace(demo_trace)

line_demo_layout = dict(
    legend_orientation="h",
    legend=dict(x=.1, xanchor="center"),
    title="Plot Title",
    xaxis_title="x Axis Title",
    yaxis_title="y Axis Title",
    template='plotly_white',
    # tight margins around the plot area (left, right, top, bottom in px)
    margin=dict(l=5, r=5, t=30, b=20),
)
fig.update_layout(**line_demo_layout)
fig.show()
# Two independent standard-normal samples; the fixed seed keeps the demo reproducible.
np.random.seed(0)
x_var = np.random.normal(size=6000)
y_var = np.random.normal(size=6000)

# Points with x_var above 1 are labelled "Category A", everything else "Category B".
norm_data = pd.DataFrame({
    'x_var': x_var,
    'y_var': y_var,
    'category_var': np.where(x_var > 1, "Category A", "Category B"),
})
# Semi-transparent scatter of the random sample, coloured by category.
scatter_options = dict(
    data_frame=norm_data,
    x='x_var',
    y='y_var',
    color='category_var',
    opacity=.2,
)
fig = px.scatter(**scatter_options)

fig.update_layout(
    title="Scatter Plot",
    xaxis_title="x Axis Title",
    yaxis_title="y Axis Title",
    template='plotly_white',
    width=800,
    height=400,
    margin=dict(l=5, r=5, t=30, b=20),
)
fig.show()
# 2x2 grid: the left cell spans both rows, so its second-row slot is None.
grid_specs = [
    [{"rowspan": 2}, {}],
    [None, {}],
]
fig = make_subplots(rows=2, cols=2, specs=grid_specs)

# Fix the axis window and highlight the zero lines for the right-hand column only.
fig.update_yaxes(col=2, range=[-0.5, 1.5], zeroline=True, zerolinewidth=2, zerolinecolor='LightPink')
fig.update_xaxes(col=2, range=[-0.5, 1.5], zeroline=True, zerolinewidth=2, zerolinecolor='#008000')

# (trace, row, column) triples, added in this order.
outlined_marker = dict(color='LightSkyBlue', size=10, line=dict(color='MediumPurple', width=3))
placed_traces = [
    (go.Scatter(x=x, y=np.sin(x), name='sin(x)'), 2, 2),
    (go.Scatter(x=x, y=np.cos(x), name='cos(x)'), 2, 2),
    (go.Scatter(x=x, y=np.tan(x), name='tg(x)'), 1, 1),
    (go.Scatter(x=x, y=square(x), mode='lines+markers', name='f(x)=x<sup>2</sup>'), 1, 2),
    (go.Scatter(x=x, y=x, mode='markers', name='g(x)=x', marker=outlined_marker), 1, 2),
]
for placed_trace, grid_row, grid_col in placed_traces:
    fig.add_trace(placed_trace, grid_row, grid_col)

fig.update_layout(
    legend_orientation="h",
    legend=dict(x=.5, xanchor="center"),
    hovermode="x",
    margin=dict(l=0, r=0, t=0, b=0),
)
# Same hover text template for every trace added above.
fig.update_traces(hoverinfo="all", hovertemplate="Value: %{x}<br>Function: %{y}")
fig.show()
# Number of titles per rating as a two-column frame: 'rating' (label) and 'count'.
# The column names are set explicitly via rename_axis / reset_index(name=...)
# instead of renaming whatever value_counts().reset_index() happens to emit:
# those default names differ between pandas 1.x ('index', 'rating') and
# pandas 2.x ('rating', 'count'), and the old rename produced two 'count'
# columns on pandas 2.x.
df_rating = (
    df['rating']
    .value_counts()
    .rename_axis('rating')
    .reset_index(name='count')
)
df_rating.head()
| rating | count | |
|---|---|---|
| 0 | TV-MA | 1822 |
| 1 | TV-14 | 1214 |
| 2 | R | 778 |
| 3 | PG-13 | 470 |
| 4 | TV-PG | 431 |
# Horizontal bar chart of title counts per rating, each bar labelled with its count.
fig_bar = px.bar(
    df_rating,
    x='count',
    y='rating',
    text='count',
    title='Distribution of Rating',
)

# The count axis is hidden (bars carry their own labels); ratings are ordered by total.
fig_bar.update_xaxes(visible=False)
rating_axis_style = dict(
    showgrid=False,
    showline=False,
    ticksuffix=' ',
    categoryorder='total ascending',
)
fig_bar.update_yaxes(**rating_axis_style)

fig_bar.update_traces(marker=dict(line=dict(width=0)), hovertemplate=None)
fig_bar.update_layout(
    title_font=dict(size=25),
    xaxis_title=' ',
    yaxis_title=" ",
    hovermode="y unified",
    margin=dict(t=40, b=0, l=0, r=0),
    legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="center", x=0.5),
)
# Histogram of titles per rating, split by type with the default bar mode.
fig_stack_bar = px.histogram(
    df,
    y='rating',
    color='type',
    color_discrete_sequence=['#b20710', 'orange'],
    title='Which has the highest Rating TV shows or Movies?',
)

fig_stack_bar.update_xaxes(visible=False)
fig_stack_bar.update_yaxes(
    categoryorder='total ascending',
    ticksuffix=' ',
    showgrid=False,
    showline=False,
)
fig_stack_bar.update_traces(marker=dict(line=dict(width=0)), hovertemplate=None)
fig_stack_bar.update_layout(
    title_font=dict(size=25),
    xaxis_title=' ',
    yaxis_title=" ",
    hovermode="y unified",
    margin=dict(t=70, b=0, l=0, r=0),
    legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="center", x=0.5),
)
# Same per-rating histogram, but with the two types drawn side by side (barmode='group').
fig_group_bar = px.histogram(
    df,
    y='rating',
    color='type',
    barmode='group',
    color_discrete_sequence=['#b20710', 'orange'],
    title='Which has the highest Rating TV shows or Movies?',
)

fig_group_bar.update_xaxes(showgrid=False)
fig_group_bar.update_yaxes(
    categoryorder='total ascending',
    ticksuffix=' ',
    showgrid=False,
    showline=False,
)
fig_group_bar.update_traces(marker=dict(line=dict(width=0)), hovertemplate=None)
fig_group_bar.update_layout(
    title_font=dict(size=25),
    xaxis_title=' ',
    yaxis_title=" ",
    hovermode="y unified",
    margin=dict(t=70, b=0, l=0, r=0),
    legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="center", x=0.5),
)
# Work on a copy of df.
dff = df.copy()

# Per-rating title counts, one frame per type. The column layout read by the plot is:
#   df_tv_show: 'tv_show' (rating label), 'rating' (negated count), 'rating_final' (count)
#   df_movie:   'movie'   (rating label), 'rating' (count)
# Column names are set explicitly with rename_axis / reset_index(name=...) because
# the default names produced by value_counts().reset_index() changed between
# pandas 1.x and 2.x; with the old rename-based code, pandas 2.x left the labels in
# 'rating' and the negation below was applied to strings instead of counts.
df_tv_show = (
    dff.loc[dff['type'] == 'TV Show', 'rating']
    .value_counts()
    .rename_axis('tv_show')
    .reset_index(name='rating')
)
# Keep the positive counts for the bar labels ...
df_tv_show['rating_final'] = df_tv_show['rating']
# ... and negate the plotted values so TV-show bars extend to the left.
df_tv_show['rating'] *= -1

df_movie = (
    dff.loc[dff['type'] == 'Movie', 'rating']
    .value_counts()
    .rename_axis('movie')
    .reset_index(name='rating')
)
df_movie = df_movie.sort_values(by=['rating'])  # sort for order on plot
# Two panels that share the rating axis and touch each other (no horizontal gap):
# TV shows on the left (negative x values), movies on the right.
fig = make_subplots(rows=1, cols=2, specs=[[{}, {}]], shared_yaxes=True, horizontal_spacing=0)

# `append_trace` is deprecated in plotly; `add_trace` with row/col is the supported call.
# bar plot for movies
fig.add_trace(
    go.Bar(x=df_movie.rating, y=df_movie.movie, orientation='h', showlegend=True,
           text=df_movie.rating, name='Movie', marker_color='#b20710'),
    row=1, col=2,
)
# bar plot for tv shows (bar labels use the positive counts kept in rating_final)
fig.add_trace(
    go.Bar(x=df_tv_show.rating, y=df_tv_show.tv_show, orientation='h', showlegend=True,
           text=df_tv_show.rating_final, name='TV Show', marker_color='#221f1f'),
    row=1, col=1,
)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False, categoryorder='total ascending', ticksuffix=' ', showline=False)
fig.update_traces(hovertemplate=None, marker=dict(line=dict(width=0)))
fig.update_layout(title='Which has the highest rating TV shows or Movies?',
                  margin=dict(t=80, b=0, l=0, r=0),
                  hovermode="y unified",
                  xaxis_title=' ', yaxis_title=" ",
                  title_font=dict(size=25),
                  legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="center", x=0.5),
                  )
# Movies added per year and the year-over-year change.
# Fixes two NameErrors in the previous version: `d2` (typo for `df2`) and `col`,
# which was never defined.
col = "year_added"
df2 = df[df["type"] == "Movie"]

# Name the columns explicitly; the defaults from value_counts().reset_index()
# differ between pandas 1.x and 2.x.
df3 = df2[col].value_counts().rename_axis(col).reset_index(name="count")
df3 = df3.sort_values(col)

# The first year has no predecessor, so its difference is set to 0.
df3['diff'] = df3['count'].diff().fillna(0).astype(int)
df3 = df3.reset_index(drop=True)
# Waterfall of year-over-year changes: green for increases, red for decreases.
year_labels = df3.year_added.apply(str).to_list()
yearly_deltas = df3['diff'].to_list()
delta_labels = df3['diff'].abs().apply(str).to_list()  # bar text shows the magnitude only

movie_waterfall = go.Waterfall(
    name="Movie",
    orientation="v",
    x=year_labels,
    y=yearly_deltas,
    text=delta_labels,
    textposition="auto",
    increasing={"marker": {"color": "green"}},
    decreasing={"marker": {"color": "red"}},
)
fig2 = go.Figure(movie_waterfall)

fig2.update_xaxes(showgrid=False)
fig2.update_yaxes(showgrid=False, visible=False)
fig2.update_traces(hovertemplate=None)
fig2.update_layout(
    title='Added movies by year',
    title_font=dict(size=25),
    height=350,
    xaxis_title=' ',
    yaxis_title=" ",
    hovermode="x unified",
    margin=dict(t=60, b=0, l=0, r=0),
)
# Waterfall measure per row: every row is 'relative' except row 0, which is 'absolute'.
# (The closing 'total' entry is not stored here; it is appended when the trace is built.)
df3['measure'] = 'relative'
df3.loc[0, 'measure'] = 'absolute'
df3
| year_added | count | diff | measure | |
|---|---|---|---|---|
| 0 | 2008 | 1 | 0 | absolute |
| 1 | 2009 | 2 | 1 | relative |
| 2 | 2010 | 1 | -1 | relative |
| 3 | 2011 | 13 | 12 | relative |
| 4 | 2012 | 3 | -10 | relative |
| 5 | 2013 | 6 | 3 | relative |
| 6 | 2014 | 14 | 8 | relative |
| 7 | 2015 | 47 | 33 | relative |
| 8 | 2016 | 195 | 148 | relative |
| 9 | 2017 | 702 | 507 | relative |
| 10 | 2018 | 1085 | 383 | relative |
| 11 | 2019 | 1236 | 151 | relative |
| 12 | 2020 | 1151 | -85 | relative |
| 13 | 2021 | 729 | -422 | relative |
# Same waterfall with explicit measures and a trailing 'Total' bar.
# The total bar has no y value of its own (None); its measure is 'total'.
total_x = df3.year_added.apply(str).to_list() + ['Total']
total_y = df3['diff'].to_list() + [None]
total_measure = df3['measure'].to_list() + ['total']
total_text = df3['diff'].abs().apply(str).to_list()

movie_waterfall_total = go.Waterfall(
    name="Movie",
    orientation="v",
    x=total_x,
    y=total_y,
    measure=total_measure,
    text=total_text,
    textposition="auto",
    increasing={"marker": {"color": "green"}},
    decreasing={"marker": {"color": "red"}},
)
fig2 = go.Figure(movie_waterfall_total)

fig2.update_xaxes(showgrid=False)
fig2.update_yaxes(showgrid=False, visible=False)
fig2.update_traces(hovertemplate=None)
fig2.update_layout(
    title='Added movies by year',
    title_font=dict(size=25),
    height=350,
    xaxis_title=' ',
    yaxis_title=" ",
    hovermode="x unified",
    margin=dict(t=60, b=0, l=0, r=0),
)
# Minimal Sankey example: links refer to nodes by their position in `label`.
label = ["ZERO", "ONE", "TWO", "THREE", "FOUR", "FIVE"]
source = [0, 0, 1, 1, 0]  # index of the node each link starts from
target = [2, 3, 4, 5, 4]  # index of the node each link ends at
value = [8, 2, 2, 8, 4]   # one value per link

# Bundle the link and node settings, then build the trace.
link = {'source': source, 'target': target, 'value': value}
node = {'label': label, 'pad': 50, 'thickness': 5}
data = go.Sankey(node=node, link=link)

# Draw it.
fig = go.Figure(data)
fig.show()
# Build every (year, client) pair: both frames carry a constant 'id' key,
# so merging on it pairs each year with each client; the helper key is then dropped.
clients = pd.DataFrame({'clientId': range(1, 20)}).assign(id=1)
dates = pd.DataFrame({'date': range(2017, 2023)}).assign(id=1)
df = dates.merge(clients, on='id').drop(columns='id')
# Client type codes and their readable names (code = position in the list).
types = dict(enumerate(['New', 'Not active', 'Good', 'Bad']))
# Create the client type history for each date.
# Each entry is (year, first clientId, stop clientId (exclusive), type code);
# entries are applied in order.
type_history = [
    (2017, 1, 11, 0),
    (2017, 11, 20, 1),
    (2018, 1, 4, 2),
    (2018, 4, 20, 1),
    (2019, 1, 6, 2),
    (2019, 6, 10, 3),
    (2019, 10, 20, 1),
    (2020, 1, 20, 1),
    (2021, 1, 8, 2),
    (2021, 8, 10, 3),
    (2021, 10, 20, 1),
    (2022, 1, 10, 1),
    (2022, 10, 20, 2),
]
df['type'] = None
for history_year, first_id, stop_id, type_code in type_history:
    in_segment = (df.date == history_year) & df.clientId.isin(list(range(first_id, stop_id)))
    df.loc[in_segment, 'type'] = type_code
# Attach each client's type in the following year (missing when there is no following year).
df['next_date'] = df['date'] + 1
following_year = df[['date', 'type', 'clientId']].rename(
    columns={'date': 'next_date', 'type': 'type_next'}
)
df2 = df.merge(following_year, on=['next_date', 'clientId'], how='left')
df2.head()
| date | clientId | type | next_date | type_next | |
|---|---|---|---|---|---|
| 0 | 2017 | 1 | 0 | 2018 | 2 |
| 1 | 2017 | 2 | 0 | 2018 | 2 |
| 2 | 2017 | 3 | 0 | 2018 | 2 |
| 3 | 2017 | 4 | 0 | 2018 | 1 |
| 4 | 2017 | 5 | 0 | 2018 | 1 |
# Build "<year>_<type>" keys for the current and the following year,
# then keep only rows that actually have a following-year type.
key_columns = (
    ('type_with_date', 'date', 'type'),
    ('type_next_with_date', 'next_date', 'type_next'),
)
for key_name, year_column, type_column in key_columns:
    df2[key_name] = df2[year_column].map(str) + '_' + df2[type_column].map(str)
df2 = df2[df2['type_next'].notna()].copy()
df2.head()
| date | clientId | type | next_date | type_next | type_with_date | type_next_with_date | |
|---|---|---|---|---|---|---|---|
| 0 | 2017 | 1 | 0 | 2018 | 2 | 2017_0 | 2018_2 |
| 1 | 2017 | 2 | 0 | 2018 | 2 | 2017_0 | 2018_2 |
| 2 | 2017 | 3 | 0 | 2018 | 2 | 2017_0 | 2018_2 |
| 3 | 2017 | 4 | 0 | 2018 | 1 | 2017_0 | 2018_1 |
| 4 | 2017 | 5 | 0 | 2018 | 1 | 2017_0 | 2018_1 |
# Calculate the count of each transition (current key -> next-year key).
counts = df2.loc[:, ['date', 'type_with_date', 'type_next_with_date']].value_counts()

# Name the count column 'cnt' explicitly. The previous rename({0: 'cnt'}) only
# worked when the counts column was called 0 (pandas 1.x); on pandas 2.x it is
# called 'count', so 'cnt' was never created and later lookups of it failed.
df_counts = counts.reset_index(name='cnt').sort_values(by=['date'])
df_counts
| date | type_with_date | type_next_with_date | cnt | |
|---|---|---|---|---|
| 4 | 2017 | 2017_1 | 2018_1 | 9 |
| 5 | 2017 | 2017_0 | 2018_1 | 7 |
| 11 | 2017 | 2017_0 | 2018_2 | 3 |
| 0 | 2018 | 2018_1 | 2019_1 | 10 |
| 9 | 2018 | 2018_1 | 2019_3 | 4 |
| 12 | 2018 | 2018_2 | 2019_2 | 3 |
| 13 | 2018 | 2018_1 | 2019_2 | 2 |
| 1 | 2019 | 2019_1 | 2020_1 | 10 |
| 8 | 2019 | 2019_2 | 2020_1 | 5 |
| 10 | 2019 | 2019_3 | 2020_1 | 4 |
| 2 | 2020 | 2020_1 | 2021_1 | 10 |
| 6 | 2020 | 2020_1 | 2021_2 | 7 |
| 14 | 2020 | 2020_1 | 2021_3 | 2 |
| 3 | 2021 | 2021_1 | 2022_2 | 10 |
| 7 | 2021 | 2021_2 | 2022_1 | 7 |
| 15 | 2021 | 2021_3 | 2022_1 | 2 |
# Give every distinct "<year>_<type>" key (as a source or as a target) a unique integer id.
# Keys are sorted, so the ids are stable and follow year/type order.
all_types = sorted(
    set(df_counts['type_with_date']) | set(df_counts['type_next_with_date'])
)
d = {type_key: node_id for node_id, type_key in enumerate(all_types)}
d
{'2017_0': 0,
'2017_1': 1,
'2018_1': 2,
'2018_2': 3,
'2019_1': 4,
'2019_2': 5,
'2019_3': 6,
'2020_1': 7,
'2021_1': 8,
'2021_2': 9,
'2021_3': 10,
'2022_1': 11,
'2022_2': 12}
# Translate both ends of every transition into their integer node ids (for plotly).
df_counts['type_final'] = [d[type_key] for type_key in df_counts['type_with_date']]
df_counts['type_next_final'] = [d[type_key] for type_key in df_counts['type_next_with_date']]
df_counts.head()
| date | type_with_date | type_next_with_date | cnt | type_final | type_next_final | |
|---|---|---|---|---|---|---|
| 4 | 2017 | 2017_1 | 2018_1 | 9 | 1 | 2 |
| 5 | 2017 | 2017_0 | 2018_1 | 7 | 0 | 2 |
| 11 | 2017 | 2017_0 | 2018_2 | 3 | 0 | 3 |
| 0 | 2018 | 2018_1 | 2019_1 | 10 | 2 | 4 |
| 9 | 2018 | 2018_1 | 2019_3 | 4 | 2 | 6 |
# Sankey inputs: one link per transition row, node labels taken from the id map keys.
source = df_counts['type_final'].to_list()
target = df_counts['type_next_final'].to_list()
value = df_counts['cnt'].to_list()
labels = list(d.keys())

# Bundle the link and node settings, then build the trace.
link = {'source': source, 'target': target, 'value': value}
node = {'label': labels, 'pad': 15, 'thickness': 5}
data = go.Sankey(node=node, link=link)

# Draw it.
fig = go.Figure(data)
fig.show()
# Initialise plotly's offline notebook mode.
# NOTE(review): this call comes after every fig.show() above — confirm whether it
# was meant to run before the first figure is displayed.
plotly.offline.init_notebook_mode()